#include "nnie.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <math.h>

#include "mpi_sys.h"
#include "mpi_nnie.h"
#include "mpi_ive.h"
#include "mpi_vb.h"
#include "hi_comm_video.h"

#include "arm_neon.h"

/*
 * Round u32Num up to the nearest multiple of X (integer arithmetic).
 * Every argument use is parenthesized so that expression arguments such as
 * `a ? b : c` or `w * size` expand with the intended precedence.
 * Note: X is evaluated more than once, so do not pass expressions with side effects.
 */
#define ALIGNX(u32Num, X) (((u32Num) + (X) - 1) / (X) * (X))
/* Round u32Num up to the nearest multiple of 16. */
#define ALIGN(u32Num) ALIGNX((u32Num), 16)

/* Upper bound used by the size checks in this file: every computed size must not exceed this 32-bit maximum. */
#define SVP_NNIE_MAX_MEM 0xFFFFFFFF
/* Fixed-point scale: floating-point thresholds are multiplied by this value before being stored as integers. */
#define SVP_NNIE_QUANT_BASE 4096
/* Multiplier applied per ROI when sizing the ROI output buffer (presumably the number of box coordinates). */
#define SVP_NNIE_COORDI_NUM 4
/*
 * Named element offsets; their use is outside the part of the file visible here.
 * NOTE(review): presumably indices into a per-box record (x min, y min, x max, y max, score, suppress flag) - confirm.
 */
#define SVP_NNIE_X_MIN_OFFSET 0
#define SVP_NNIE_Y_MIN_OFFSET 1
#define SVP_NNIE_X_MAX_OFFSET 2
#define SVP_NNIE_Y_MAX_OFFSET 3
#define SVP_NNIE_SCORE_OFFSET 4
#define SVP_NNIE_SUPPRESS_FLAG_OFFSET 5

/* Larger / smaller of two values. Each argument may be evaluated twice, so avoid side effects. */
#define SVP_NNIE_MAX(a, b) (((a) > (b)) ? (a) : (b))
#define SVP_NNIE_MIN(a, b) (((a) < (b)) ? (a) : (b))
/* Logistic function 1 / (1 + e^-x); computed through exp() and cast to HI_FLOAT. */
#define SVP_NNIE_SIGMOID(x) (HI_FLOAT)((1.0f) / (1 + exp(-(x))))

/* Number of destination blobs that segment 0 of a YOLO model must report (checked in SVP_NNIE_Yolo_Init). */
#define SVP_YOLO_REPORT_BLOB_NUM 3
/* NOTE(review): presumably values per predicted box (80 class scores + 4 coordinates + 1 objectness) - confirm. */
#define SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM 85
/* NOTE(review): usage not visible here; presumably a score threshold for verbose reporting - confirm. */
#define SVP_YOLO_VERBOSE_THRESHOLD 0.8f

/* NOTE(review): usage not visible here; presumably the super-resolution scale factor and its video-buffer id - confirm. */
#define SVP_SR_UPSCALE_FACTOR 4
#define SVP_SR_VB_ID 2

/*
 * Pair of signed 32-bit bounds. SVP_NNIE_Yolo_Init reserves one of these per
 * candidate box when sizing its "assist stack" scratch area.
 * NOTE(review): presumably a [min, max] index range for a non-recursive sort - confirm.
 */
typedef struct
{
    HI_S32 s32Min; /* lower bound */
    HI_S32 s32Max; /* upper bound */
} SVP_NNIE_STACK_S;

/*
 * One candidate bounding box. SVP_NNIE_Yolo_Init reserves one of these per
 * grid cell and anchor when sizing its scratch buffer.
 * Field order is x-min, x-max, y-min, y-max (not x-min, y-min, x-max, y-max).
 */
typedef struct
{
    HI_FLOAT f32Xmin;   /* left edge */
    HI_FLOAT f32Xmax;   /* right edge */
    HI_FLOAT f32Ymin;   /* top edge */
    HI_FLOAT f32Ymax;   /* bottom edge */
    HI_S32 s32ClsScore; /* class score; NOTE(review): presumably scaled by SVP_NNIE_QUANT_BASE - confirm */
    HI_U32 u32ClassIdx; /* class index */
    HI_U32 u32Mask;     /* NOTE(review): presumably a suppression flag set during NMS - confirm */
} SVP_YOLO_BBOX_S;

/**
 * Load an NNIE model file and prepare every buffer needed to run it.
 *
 * Steps, in order:
 *   1. Read the whole file named by pcModelName into a newly allocated MMZ
 *      buffer and record that buffer in pstNnieParam->stModelBuf.
 *   2. Parse it with HI_MPI_SVP_NNIE_LoadModel into pstNnieParam->stModel.
 *   3. Fill the per-segment forward-control structures and the source /
 *      destination blob descriptors from the parsed model.
 *   4. Compute the sizes of the task buffers, the temporary buffer, the source
 *      blobs of segment 0 and the destination blobs of every segment, failing
 *      if any intermediate size exceeds SVP_NNIE_MAX_MEM.
 *   5. Allocate one cached MMZ region for all of the above, carve it up, and
 *      register each segment's task buffer with HI_MPI_SVP_NNIE_AddTskBuf.
 *
 * pstNnieParam->u32MaxInputNum, u32MaxRoiNum and au64StepVirAddr are read here
 * but never written, so the caller must set them before calling.
 *
 * On failure this function does not free memory it has already allocated.
 * Every successful allocation is recorded in pstNnieParam first.
 * NOTE(review): presumably the caller is expected to call SVP_NNIE_DeInit
 * after a failed Init to release it - confirm against callers.
 *
 * @param pcModelName   Path of the model file to load; must not be NULL.
 * @param pstNnieParam  Parameter block to fill in; must not be NULL.
 * @return HI_SUCCESS on success, HI_FAILURE on any error (a message is printed).
 */
HI_S32 SVP_NNIE_Init(HI_CHAR *pcModelName, SVP_NNIE_PARAM_S *pstNnieParam)
{
    HI_S32 s32Ret;

    if (pstNnieParam == HI_NULL)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return HI_FAILURE;
    }

    /* fopen(NULL, ...) is undefined behaviour; reject it explicitly. */
    if (pcModelName == HI_NULL)
    {
        printf("[FAIL] pcModelName is NULL\n");
        return HI_FAILURE;
    }

    /* ---- 1. Read the model file into MMZ memory ---- */
    FILE *fp = fopen(pcModelName, "rb");
    if (fp == HI_NULL)
    {
        printf("[FAIL] fopen: \"%s\"\n", pcModelName);
        return HI_FAILURE;
    }

    s32Ret = fseek(fp, 0L, SEEK_END);
    if (s32Ret != 0)
    {
        printf("[FAIL] fseek failed!\n");
        fclose(fp);
        return HI_FAILURE;
    }

    HI_SL slFileSize = ftell(fp);
    if (slFileSize <= 0)
    {
        printf("[FAIL] ftell failed!\n");
        fclose(fp);
        return HI_FAILURE;
    }

    /* The size is stored in a 32-bit field below; refuse files that would be truncated. */
    if ((HI_U64)slFileSize > SVP_NNIE_MAX_MEM)
    {
        printf("[FAIL] model file size should be less than %x\n", SVP_NNIE_MAX_MEM);
        fclose(fp);
        return HI_FAILURE;
    }

    s32Ret = fseek(fp, 0L, SEEK_SET);
    if (s32Ret != 0)
    {
        printf("[FAIL] fseek failed!\n");
        fclose(fp);
        return HI_FAILURE;
    }

    HI_U64 u64PhyAddr = 0;
    HI_U8 *pu8VirAddr = HI_NULL;

    s32Ret = HI_MPI_SYS_MmzAlloc((HI_U64 *)&u64PhyAddr, (HI_VOID **)&pu8VirAddr, "NNIE_MODEL", HI_NULL, slFileSize);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SYS_MmzAlloc: %x\n", s32Ret);
        fclose(fp);
        return HI_FAILURE;
    }

    /* Record the buffer before reading so that it is reachable for release even if a later step fails. */
    pstNnieParam->stModelBuf.u32Size = (HI_U32)slFileSize;
    pstNnieParam->stModelBuf.u64PhyAddr = u64PhyAddr;
    pstNnieParam->stModelBuf.u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr;

    /* Read the file as a single item so that a short read is reported as failure. */
    if (fread(pu8VirAddr, slFileSize, 1, fp) != 1)
    {
        printf("[FAIL] fread failed!\n");
        fclose(fp);
        return HI_FAILURE;
    }
    fclose(fp);

    /* ---- 2. Parse the model ---- */
    s32Ret = HI_MPI_SVP_NNIE_LoadModel(&pstNnieParam->stModelBuf, &pstNnieParam->stModel);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SVP_NNIE_LoadModel: %x\n", s32Ret);
        return HI_FAILURE;
    }

    /* ---- 3. Fill forward controls and blob shapes for every segment ---- */
    HI_U32 u32Num = 0;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum; i++)
    {
        if (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_ROI)
        {
            pstNnieParam->astForwardWithBboxCtrl[i].enNnieId = SVP_NNIE_ID_0;
            pstNnieParam->astForwardWithBboxCtrl[i].u32SrcNum = pstNnieParam->stModel.astSeg[i].u16SrcNum;
            pstNnieParam->astForwardWithBboxCtrl[i].u32DstNum = pstNnieParam->stModel.astSeg[i].u16DstNum;
            pstNnieParam->astForwardWithBboxCtrl[i].u32ProposalNum = 1;
            pstNnieParam->astForwardWithBboxCtrl[i].u32NetSegId = i;
        }
        else if ((pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_CNN) ||
                 (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_RECURRENT))
        {
            pstNnieParam->astForwardCtrl[i].enNnieId = SVP_NNIE_ID_0;
            pstNnieParam->astForwardCtrl[i].u32SrcNum = pstNnieParam->stModel.astSeg[i].u16SrcNum;
            pstNnieParam->astForwardCtrl[i].u32DstNum = pstNnieParam->stModel.astSeg[i].u16DstNum;
            pstNnieParam->astForwardCtrl[i].u32NetSegId = i;
        }

        /* Source blobs: copy type and shape from the model; batch size is u32MaxInputNum. */
        for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16SrcNum; j++)
        {
            if (pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType == SVP_BLOB_TYPE_SEQ_S32)
            {
                pstNnieParam->astSegData[i].astSrc[j].enType = pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType;
                pstNnieParam->astSegData[i].astSrc[j].unShape.stSeq.u32Dim =
                    pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.u32Dim;
                pstNnieParam->astSegData[i].astSrc[j].u32Num = pstNnieParam->u32MaxInputNum;
                /* Source step table for this segment is the first of its step-address slots. */
                pstNnieParam->astSegData[i].astSrc[j].unShape.stSeq.u64VirAddrStep =
                    pstNnieParam->au64StepVirAddr[i * SVP_NNIE_EACH_SEG_STEP_ADDR_NUM];
            }
            else
            {
                pstNnieParam->astSegData[i].astSrc[j].enType = pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType;
                pstNnieParam->astSegData[i].astSrc[j].unShape.stWhc.u32Chn =
                    pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Chn;
                pstNnieParam->astSegData[i].astSrc[j].unShape.stWhc.u32Height =
                    pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Height;
                pstNnieParam->astSegData[i].astSrc[j].unShape.stWhc.u32Width =
                    pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Width;
                pstNnieParam->astSegData[i].astSrc[j].u32Num = pstNnieParam->u32MaxInputNum;
            }
        }

        /* Destination batch size: ROI segments produce u32MaxRoiNum results per input. */
        if (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_ROI)
        {
            if ((HI_U64)pstNnieParam->u32MaxRoiNum * pstNnieParam->u32MaxInputNum > SVP_NNIE_MAX_MEM)
            {
                printf("[FAIL] pstNnieParam->u32MaxRoiNum * pstNnieParam->u32MaxInputNum should be less than %x\n", SVP_NNIE_MAX_MEM);
                return HI_FAILURE;
            }
            u32Num = pstNnieParam->u32MaxRoiNum * pstNnieParam->u32MaxInputNum;
        }
        else
        {
            u32Num = pstNnieParam->u32MaxInputNum;
        }

        /* Destination blobs: copy type and shape from the model. */
        for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16DstNum; j++)
        {
            if (pstNnieParam->stModel.astSeg[i].astDstNode[j].enType == SVP_BLOB_TYPE_SEQ_S32)
            {
                pstNnieParam->astSegData[i].astDst[j].enType = pstNnieParam->stModel.astSeg[i].astDstNode[j].enType;
                pstNnieParam->astSegData[i].astDst[j].unShape.stSeq.u32Dim =
                    pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.u32Dim;
                pstNnieParam->astSegData[i].astDst[j].u32Num = u32Num;
                /* Destination step table for this segment is the second of its step-address slots. */
                pstNnieParam->astSegData[i].astDst[j].unShape.stSeq.u64VirAddrStep =
                    pstNnieParam->au64StepVirAddr[i * SVP_NNIE_EACH_SEG_STEP_ADDR_NUM + 1];
            }
            else
            {
                pstNnieParam->astSegData[i].astDst[j].enType = pstNnieParam->stModel.astSeg[i].astDstNode[j].enType;
                pstNnieParam->astSegData[i].astDst[j].unShape.stWhc.u32Chn =
                    pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Chn;
                pstNnieParam->astSegData[i].astDst[j].unShape.stWhc.u32Height =
                    pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Height;
                pstNnieParam->astSegData[i].astDst[j].unShape.stWhc.u32Width =
                    pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Width;
                pstNnieParam->astSegData[i].astDst[j].u32Num = u32Num;
            }
        }
    }

    /* ---- 4. Work out how much memory is needed ---- */
    HI_U32 aau32SrcSize[SVP_NNIE_MAX_NET_SEG_NUM][SVP_NNIE_MAX_INPUT_NUM] = {0};
    HI_U32 aau32DstSize[SVP_NNIE_MAX_NET_SEG_NUM][SVP_NNIE_MAX_OUTPUT_NUM] = {0};

    s32Ret = HI_MPI_SVP_NNIE_GetTskBufSize(pstNnieParam->u32MaxInputNum, pstNnieParam->u32MaxRoiNum, &pstNnieParam->stModel,
                                           pstNnieParam->au32TaskBufSize, pstNnieParam->stModel.u32NetSegNum);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SVP_NNIE_GetTskBufSize: %x\n", s32Ret);
        return HI_FAILURE;
    }

    /* Sums are checked in 64 bits before being committed to the 32-bit accumulators. */
    HI_U32 u32TotalTaskBufSize = 0;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum; i++)
    {
        if ((HI_U64)u32TotalTaskBufSize + pstNnieParam->au32TaskBufSize[i] > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u32TotalTaskBufSize + pstNnieParam->au32TaskBufSize[i] should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }
        u32TotalTaskBufSize += pstNnieParam->au32TaskBufSize[i];
    }

    HI_U32 u32TmpBufSize = pstNnieParam->stModel.u32TmpBufSize;
    if ((HI_U64)u32TotalTaskBufSize + u32TmpBufSize > SVP_NNIE_MAX_MEM)
    {
        printf("[FAIL] u32TotalTaskBufSize + u32TmpBufSize should be less than %x\n", SVP_NNIE_MAX_MEM);
        return HI_FAILURE;
    }

    HI_U32 u32TotalSize = u32TotalTaskBufSize + u32TmpBufSize;

    HI_U64 u64TotalStep = 0;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum && i < SVP_NNIE_MAX_NET_SEG_NUM; i++)
    {
        if (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_RECURRENT)
        {
            /*
             * Sum the per-input step counts from the step table of source blob 0.
             * NOTE(review): this dereferences the caller-supplied au64StepVirAddr entry
             * without a NULL check - confirm callers always provide it for recurrent models.
             */
            for (HI_S32 j = 0; j < pstNnieParam->astSegData[i].astSrc[0].u32Num; j++)
            {
                u64TotalStep += *((HI_S32 *)(HI_UINTPTR_T)pstNnieParam->astSegData[i].astSrc[0].unShape.stSeq.u64VirAddrStep + j);
                if (u64TotalStep > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64TotalStep should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }
            }
        }

        /* Source blobs are only sized (and later placed) for segment 0. */
        if (i == 0)
        {
            HI_U64 u64Size = 0;
            HI_U64 u64Stride = 0;
            for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16SrcNum; j++)
            {
                /* Element size: 4 bytes for the S32 blob kinds, 1 byte otherwise. */
                if (SVP_BLOB_TYPE_S32 == pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType ||
                    SVP_BLOB_TYPE_VEC_S32 == pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType ||
                    SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType)
                {
                    u64Size = sizeof(HI_U32);
                }
                else
                {
                    u64Size = sizeof(HI_U8);
                }

                if (SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->stModel.astSeg[i].astSrcNode[j].enType)
                {
                    /* Sequence blob: 16-byte aligned row of u32Dim elements, one row per step. */
                    u64Stride = ALIGN(pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.u32Dim * u64Size);
                    if (u64Stride > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Stride should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }

                    u64Size = u64TotalStep * u64Stride;
                    if (u64Size > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }
                    aau32SrcSize[i][j] = (HI_U32)u64Size;
                }
                else
                {
                    /* W/H/C blob: size = num * aligned-row-stride * height * channels, checked after each factor. */
                    u64Stride = ALIGN(pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Width * u64Size);
                    if (u64Stride > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Stride should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }

                    u64Size = pstNnieParam->astSegData[i].astSrc[j].u32Num * u64Stride;
                    if (u64Size > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }

                    u64Size *= pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Height;
                    if (u64Size > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }

                    u64Size *= pstNnieParam->stModel.astSeg[i].astSrcNode[j].unShape.stWhc.u32Chn;
                    if (u64Size > SVP_NNIE_MAX_MEM)
                    {
                        printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                        return HI_FAILURE;
                    }
                    aau32SrcSize[i][j] = (HI_U32)u64Size;
                }

                if ((HI_U64)u32TotalSize + aau32SrcSize[i][j] > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u32TotalSize + aau32SrcSize[%d][%d] should be less than %x\n", i, j, SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }

                u32TotalSize += (HI_U32)aau32SrcSize[i][j];
                pstNnieParam->astSegData[i].astSrc[j].u32Stride = (HI_U32)u64Stride;
            }
        }

        /* Destination blobs are sized for every segment, using the same rules as the sources. */
        HI_U64 u64Size = 0;
        HI_U64 u64Stride = 0;
        for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16DstNum; j++)
        {
            if (SVP_BLOB_TYPE_S32 == pstNnieParam->stModel.astSeg[i].astDstNode[j].enType ||
                SVP_BLOB_TYPE_VEC_S32 == pstNnieParam->stModel.astSeg[i].astDstNode[j].enType ||
                SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->stModel.astSeg[i].astDstNode[j].enType)
            {
                u64Size = sizeof(HI_U32);
            }
            else
            {
                u64Size = sizeof(HI_U8);
            }

            if (SVP_BLOB_TYPE_SEQ_S32 == pstNnieParam->stModel.astSeg[i].astDstNode[j].enType)
            {
                u64Stride = ALIGN(pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.u32Dim * u64Size);
                if (u64Stride > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Stride should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }

                u64Size = u64TotalStep * u64Stride;
                if (u64Size > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }
                aau32DstSize[i][j] = (HI_U32)u64Size;
            }
            else
            {
                u64Stride = ALIGN(pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Width * u64Size);
                if (u64Stride > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Stride should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }

                u64Size = pstNnieParam->astSegData[i].astDst[j].u32Num * u64Stride;
                if (u64Size > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }

                u64Size *= pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Height;
                if (u64Size > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }

                u64Size *= pstNnieParam->stModel.astSeg[i].astDstNode[j].unShape.stWhc.u32Chn;
                if (u64Size > SVP_NNIE_MAX_MEM)
                {
                    printf("[FAIL] u64Size should be less than %x\n", SVP_NNIE_MAX_MEM);
                    return HI_FAILURE;
                }
                aau32DstSize[i][j] = (HI_U32)u64Size;
            }

            if ((HI_U64)u32TotalSize + aau32DstSize[i][j] > SVP_NNIE_MAX_MEM)
            {
                /* This check concerns the destination size table, so name it in the message. */
                printf("[FAIL] u32TotalSize + aau32DstSize[%d][%d] should be less than %x\n", i, j, SVP_NNIE_MAX_MEM);
                return HI_FAILURE;
            }

            u32TotalSize += aau32DstSize[i][j];
            pstNnieParam->astSegData[i].astDst[j].u32Stride = (HI_U32)u64Stride;
        }
    }

    /* ---- 5. Allocate one cached region and carve it up ---- */
    u64PhyAddr = 0;
    pu8VirAddr = HI_NULL;
    s32Ret = HI_MPI_SYS_MmzAlloc_Cached((HI_U64 *)&u64PhyAddr, (HI_VOID **)&pu8VirAddr, "NNIE_TASK", HI_NULL, u32TotalSize);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SYS_MmzAlloc_Cached: %x\n", s32Ret);
        return HI_FAILURE;
    }
    memset_s(pu8VirAddr, u32TotalSize, 0, u32TotalSize);

    /* The region is cached: write the zeroes back to memory. */
    HI_MPI_SYS_MmzFlushCache(u64PhyAddr, (HI_VOID *)pu8VirAddr, u32TotalSize);

    /* Layout: [task buffers of all segments][tmp buffer][segment-0 source blobs][destination blobs]. */
    pstNnieParam->stTaskBuf.u32Size = u32TotalTaskBufSize;
    pstNnieParam->stTaskBuf.u64PhyAddr = u64PhyAddr;
    pstNnieParam->stTaskBuf.u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr;

    pstNnieParam->stTmpBuf.u32Size = u32TmpBufSize;
    pstNnieParam->stTmpBuf.u64PhyAddr = u64PhyAddr + u32TotalTaskBufSize;
    pstNnieParam->stTmpBuf.u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr + u32TotalTaskBufSize;

    /* Every segment shares the tmp buffer and gets its own slice of the task buffer. */
    HI_U32 u32Offset = 0;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum; i++)
    {
        if (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_ROI)
        {
            pstNnieParam->astForwardWithBboxCtrl[i].stTmpBuf = pstNnieParam->stTmpBuf;
            pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf.u64PhyAddr = pstNnieParam->stTaskBuf.u64PhyAddr + u32Offset;
            pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf.u64VirAddr = pstNnieParam->stTaskBuf.u64VirAddr + u32Offset;
            pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf.u32Size = pstNnieParam->au32TaskBufSize[i];
        }
        else if ((pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_CNN) ||
                 (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_RECURRENT))
        {
            pstNnieParam->astForwardCtrl[i].stTmpBuf = pstNnieParam->stTmpBuf;
            pstNnieParam->astForwardCtrl[i].stTskBuf.u64PhyAddr = pstNnieParam->stTaskBuf.u64PhyAddr + u32Offset;
            pstNnieParam->astForwardCtrl[i].stTskBuf.u64VirAddr = pstNnieParam->stTaskBuf.u64VirAddr + u32Offset;
            pstNnieParam->astForwardCtrl[i].stTskBuf.u32Size = pstNnieParam->au32TaskBufSize[i];
        }
        u32Offset += pstNnieParam->au32TaskBufSize[i];
    }

    /*
     * Hand out blob addresses sequentially after the task and tmp buffers.
     * The cursor is advanced right after each assignment, so a segment with
     * zero source or destination nodes never indexes the size tables at -1.
     */
    u64PhyAddr = u64PhyAddr + u32TotalTaskBufSize + u32TmpBufSize;
    pu8VirAddr = pu8VirAddr + u32TotalTaskBufSize + u32TmpBufSize;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum; i++)
    {
        if (i == 0)
        {
            for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16SrcNum; j++)
            {
                pstNnieParam->astSegData[i].astSrc[j].u64PhyAddr = u64PhyAddr;
                pstNnieParam->astSegData[i].astSrc[j].u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr;
                u64PhyAddr += aau32SrcSize[i][j];
                pu8VirAddr += aau32SrcSize[i][j];
            }
        }

        for (HI_S32 j = 0; j < pstNnieParam->stModel.astSeg[i].u16DstNum; j++)
        {
            pstNnieParam->astSegData[i].astDst[j].u64PhyAddr = u64PhyAddr;
            pstNnieParam->astSegData[i].astDst[j].u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr;
            u64PhyAddr += aau32DstSize[i][j];
            pu8VirAddr += aau32DstSize[i][j];
        }
    }

    /* Register each segment's task buffer with the NNIE driver. */
    for (HI_S32 i = 0; i < pstNnieParam->stModel.u32NetSegNum; i++)
    {
        if (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_ROI)
        {
            s32Ret = HI_MPI_SVP_NNIE_AddTskBuf(&pstNnieParam->astForwardWithBboxCtrl[i].stTskBuf);
            if (HI_SUCCESS != s32Ret)
            {
                printf("[FAIL] HI_MPI_SVP_NNIE_AddTskBuf: %x\n", s32Ret);
                return HI_FAILURE;
            }
        }
        else if ((pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_CNN) ||
                 (pstNnieParam->stModel.astSeg[i].enNetType == SVP_NNIE_NET_TYPE_RECURRENT))
        {
            s32Ret = HI_MPI_SVP_NNIE_AddTskBuf(&pstNnieParam->astForwardCtrl[i].stTskBuf);
            if (HI_SUCCESS != s32Ret)
            {
                printf("[FAIL] HI_MPI_SVP_NNIE_AddTskBuf: %x\n", s32Ret);
                return HI_FAILURE;
            }
        }
    }

    return HI_SUCCESS;
}

/**
 * Release what SVP_NNIE_Init set up in pstNnieParam.
 *
 * For every model segment the matching task buffer is unregistered with
 * HI_MPI_SVP_NNIE_RemoveTskBuf (a failure is printed and otherwise ignored).
 * Then the task, step and model MMZ buffers are freed, in that order. A buffer
 * is freed only when both its physical and virtual addresses are non-zero, and
 * its addresses are cleared afterwards. The recorded size of each buffer is
 * always reset to 0.
 *
 * @param pstNnieParam  Parameter block to tear down; a NULL pointer is
 *                      reported and ignored.
 */
HI_VOID SVP_NNIE_DeInit(SVP_NNIE_PARAM_S *pstNnieParam)
{
    if (HI_NULL == pstNnieParam)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return;
    }

    /* Unregister the task buffer of each segment; which control array holds it depends on the net type. */
    for (HI_U32 u32Seg = 0; u32Seg < pstNnieParam->stModel.u32NetSegNum; u32Seg++)
    {
        HI_S32 s32Status = HI_SUCCESS;

        switch (pstNnieParam->stModel.astSeg[u32Seg].enNetType)
        {
        case SVP_NNIE_NET_TYPE_ROI:
            s32Status = HI_MPI_SVP_NNIE_RemoveTskBuf(&pstNnieParam->astForwardWithBboxCtrl[u32Seg].stTskBuf);
            break;

        case SVP_NNIE_NET_TYPE_CNN:
        case SVP_NNIE_NET_TYPE_RECURRENT:
            s32Status = HI_MPI_SVP_NNIE_RemoveTskBuf(&pstNnieParam->astForwardCtrl[u32Seg].stTskBuf);
            break;

        default:
            /* Other net types have no task buffer registered by this file. */
            break;
        }

        if (s32Status != HI_SUCCESS)
        {
            printf("[FAIL] HI_MPI_SVP_NNIE_RemoveTskBuf: %x\n", s32Status);
        }
    }

    /* Task buffer region. */
    HI_U64 u64Phy = pstNnieParam->stTaskBuf.u64PhyAddr;
    HI_U64 u64Vir = pstNnieParam->stTaskBuf.u64VirAddr;
    if (u64Phy != 0 && u64Vir != 0)
    {
        HI_MPI_SYS_MmzFree(u64Phy, (HI_VOID *)(HI_UINTPTR_T)u64Vir);
        pstNnieParam->stTaskBuf.u64PhyAddr = 0;
        pstNnieParam->stTaskBuf.u64VirAddr = 0;
    }
    pstNnieParam->stTaskBuf.u32Size = 0;

    /* Step buffer. */
    u64Phy = pstNnieParam->stStepBuf.u64PhyAddr;
    u64Vir = pstNnieParam->stStepBuf.u64VirAddr;
    if (u64Phy != 0 && u64Vir != 0)
    {
        HI_MPI_SYS_MmzFree(u64Phy, (HI_VOID *)(HI_UINTPTR_T)u64Vir);
        pstNnieParam->stStepBuf.u64PhyAddr = 0;
        pstNnieParam->stStepBuf.u64VirAddr = 0;
    }
    pstNnieParam->stStepBuf.u32Size = 0;

    /* Model file buffer. */
    u64Phy = pstNnieParam->stModelBuf.u64PhyAddr;
    u64Vir = pstNnieParam->stModelBuf.u64VirAddr;
    if (u64Phy != 0 && u64Vir != 0)
    {
        HI_MPI_SYS_MmzFree(u64Phy, (HI_VOID *)(HI_UINTPTR_T)u64Vir);
        pstNnieParam->stModelBuf.u64PhyAddr = 0;
        pstNnieParam->stModelBuf.u64VirAddr = 0;
    }
    pstNnieParam->stModelBuf.u32Size = 0;
}

/**
 * Run one model segment on a video frame and wait for it to finish.
 *
 * The chosen source blob is re-pointed at plane 0 of the frame (virtual
 * address, physical address and stride are overwritten). The source blob, the
 * segment's task buffer and every destination blob are cache-flushed, the
 * segment is submitted with HI_MPI_SVP_NNIE_Forward, and the call then polls
 * HI_MPI_SVP_NNIE_Query until the task reports completion.
 *
 * The segment is always driven through astForwardCtrl.
 * NOTE(review): that array is only filled for CNN / recurrent segments by
 * SVP_NNIE_Init - confirm callers never pass an ROI segment.
 *
 * @param pstNnieParam  Initialised parameter block; must not be NULL.
 * @param pstFrmInfo    Frame supplying the input data; must not be NULL.
 * @param u32SegIdx     Segment index, below SVP_NNIE_MAX_NET_SEG_NUM.
 * @param u32NodeIdx    Source node index, below SVP_NNIE_MAX_INPUT_NUM.
 * @return HI_SUCCESS once inference has finished; HI_FAILURE on a bad
 *         argument, a submit failure, or a query error other than a timeout.
 */
HI_S32 SVP_NNIE_Forward(SVP_NNIE_PARAM_S *pstNnieParam, VIDEO_FRAME_INFO_S *pstFrmInfo, HI_U32 u32SegIdx, HI_U32 u32NodeIdx)
{
    HI_S32 s32Ret;

    if (pstNnieParam == HI_NULL)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return HI_FAILURE;
    }

    if (pstFrmInfo == HI_NULL)
    {
        printf("[FAIL] pstFrmInfo is NULL\n");
        return HI_FAILURE;
    }

    /* Indices are unsigned, so only the upper bound needs checking. */
    if (u32SegIdx >= SVP_NNIE_MAX_NET_SEG_NUM)
    {
        printf("[FAIL] u32SegIdx is illegal: %u\n", u32SegIdx);
        return HI_FAILURE;
    }

    if (u32NodeIdx >= SVP_NNIE_MAX_INPUT_NUM)
    {
        printf("[FAIL] u32NodeIdx is illegal: %u\n", u32NodeIdx);
        return HI_FAILURE;
    }

    /* Feed the frame's plane 0 directly to the network as the source blob. */
    pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u64VirAddr = pstFrmInfo->stVFrame.u64VirAddr[0];
    pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u64PhyAddr = pstFrmInfo->stVFrame.u64PhyAddr[0];
    pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u32Stride = pstFrmInfo->stVFrame.u32Stride[0];

    /* Flush the source blob: num * channels * height * stride bytes. */
    HI_MPI_SYS_MmzFlushCache(pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u64PhyAddr,
                             (HI_VOID *)(HI_UINTPTR_T)pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u64VirAddr,
                             pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u32Num *
                                 pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].unShape.stWhc.u32Chn *
                                 pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].unShape.stWhc.u32Height *
                                 pstNnieParam->astSegData[u32SegIdx].astSrc[u32NodeIdx].u32Stride);

    /* Flush the segment's task buffer. */
    HI_MPI_SYS_MmzFlushCache(pstNnieParam->astForwardCtrl[u32SegIdx].stTskBuf.u64PhyAddr,
                             (HI_VOID *)(HI_UINTPTR_T)pstNnieParam->astForwardCtrl[u32SegIdx].stTskBuf.u64VirAddr,
                             pstNnieParam->astForwardCtrl[u32SegIdx].stTskBuf.u32Size);

    /* Flush every destination blob. */
    for (HI_S32 i = 0; i < pstNnieParam->astForwardCtrl[u32SegIdx].u32DstNum; i++)
    {
        if (pstNnieParam->astSegData[u32SegIdx].astDst[i].enType == SVP_BLOB_TYPE_SEQ_S32)
        {
            /*
             * Sequence blob: its length is (sum of this blob's step counts) * stride.
             * The sum is restarted for each blob so that an earlier sequence blob's
             * steps do not inflate the flush length of a later one.
             */
            HI_U32 u32TotalStepNum = 0;
            for (HI_S32 j = 0; j < pstNnieParam->astSegData[u32SegIdx].astDst[i].u32Num; j++)
            {
                u32TotalStepNum += *((HI_U32 *)(HI_UINTPTR_T)pstNnieParam->astSegData[u32SegIdx].astDst[i].unShape.stSeq.u64VirAddrStep + j);
            }
            HI_MPI_SYS_MmzFlushCache(pstNnieParam->astSegData[u32SegIdx].astDst[i].u64PhyAddr,
                                     (HI_VOID *)(HI_UINTPTR_T)pstNnieParam->astSegData[u32SegIdx].astDst[i].u64VirAddr,
                                     u32TotalStepNum * pstNnieParam->astSegData[u32SegIdx].astDst[i].u32Stride);
        }
        else
        {
            HI_MPI_SYS_MmzFlushCache(pstNnieParam->astSegData[u32SegIdx].astDst[i].u64PhyAddr,
                                     (HI_VOID *)(HI_UINTPTR_T)pstNnieParam->astSegData[u32SegIdx].astDst[i].u64VirAddr,
                                     pstNnieParam->astSegData[u32SegIdx].astDst[i].u32Num *
                                         pstNnieParam->astSegData[u32SegIdx].astDst[i].unShape.stWhc.u32Chn *
                                         pstNnieParam->astSegData[u32SegIdx].astDst[i].unShape.stWhc.u32Height *
                                         pstNnieParam->astSegData[u32SegIdx].astDst[i].u32Stride);
        }
    }

    /* Submit the segment. */
    SVP_NNIE_HANDLE hNnieHandle = 0;
    s32Ret = HI_MPI_SVP_NNIE_Forward(&hNnieHandle,
                                     pstNnieParam->astSegData[u32SegIdx].astSrc,
                                     &pstNnieParam->stModel,
                                     pstNnieParam->astSegData[u32SegIdx].astDst,
                                     &pstNnieParam->astForwardCtrl[u32SegIdx],
                                     HI_TRUE);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SVP_NNIE_Forward: %x\n", s32Ret);
        return HI_FAILURE;
    }

    /* Wait for completion: retry on timeout, give up on any other query error. */
    HI_BOOL bFinish = HI_FALSE;
    do
    {
        usleep(100);
        while (HI_ERR_SVP_NNIE_QUERY_TIMEOUT ==
               (s32Ret = HI_MPI_SVP_NNIE_Query(pstNnieParam->astForwardCtrl[u32SegIdx].enNnieId, hNnieHandle, &bFinish, HI_TRUE)))
        {
            printf("[WARNING] HI_MPI_SVP_NNIE_Query: HI_ERR_SVP_NNIE_QUERY_TIMEOUT\n");
            usleep(100);
        }

        /* Without this check a persistent query error would leave bFinish false and spin forever. */
        if (HI_SUCCESS != s32Ret)
        {
            printf("[FAIL] HI_MPI_SVP_NNIE_Query: %x\n", s32Ret);
            return HI_FAILURE;
        }
    } while (HI_TRUE != bFinish);

    return HI_SUCCESS;
}

HI_S32 SVP_NNIE_Yolo_Init(SVP_NNIE_PARAM_S *pstNnieParam, SVP_YOLO_PARAM_S *pstYoloParam)
{
    HI_S32 s32Ret;

    if (pstNnieParam == HI_NULL)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return HI_FAILURE;
    }

    if (pstYoloParam == HI_NULL)
    {
        printf("[FAIL] pstYoloParam is NULL\n");
        return HI_FAILURE;
    }

    pstYoloParam->u32OriImHeight = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Height;
    pstYoloParam->u32OriImWidth = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Width;
    pstYoloParam->u32BboxNumEachGrid = 3;
    pstYoloParam->u32ClassNum = 80;
    pstYoloParam->au32GridNumHeight[0] = 13;
    pstYoloParam->au32GridNumHeight[1] = 26;
    pstYoloParam->au32GridNumHeight[2] = 52;
    pstYoloParam->au32GridNumWidth[0] = 13;
    pstYoloParam->au32GridNumWidth[1] = 26;
    pstYoloParam->au32GridNumWidth[2] = 52;
    pstYoloParam->u32NmsThresh = (HI_U32)(0.3f * SVP_NNIE_QUANT_BASE);
    pstYoloParam->u32ConfThresh = (HI_U32)(0.5f * SVP_NNIE_QUANT_BASE);
    pstYoloParam->u32MaxRoiNum = 10;
    pstYoloParam->af32Bias[0][0] = 116;
    pstYoloParam->af32Bias[0][1] = 90;
    pstYoloParam->af32Bias[0][2] = 156;
    pstYoloParam->af32Bias[0][3] = 198;
    pstYoloParam->af32Bias[0][4] = 373;
    pstYoloParam->af32Bias[0][5] = 326;
    pstYoloParam->af32Bias[1][0] = 30;
    pstYoloParam->af32Bias[1][1] = 61;
    pstYoloParam->af32Bias[1][2] = 62;
    pstYoloParam->af32Bias[1][3] = 45;
    pstYoloParam->af32Bias[1][4] = 59;
    pstYoloParam->af32Bias[1][5] = 119;
    pstYoloParam->af32Bias[2][0] = 10;
    pstYoloParam->af32Bias[2][1] = 13;
    pstYoloParam->af32Bias[2][2] = 16;
    pstYoloParam->af32Bias[2][3] = 30;
    pstYoloParam->af32Bias[2][4] = 33;
    pstYoloParam->af32Bias[2][5] = 23;

    HI_U32 u32ClassNum = pstYoloParam->u32ClassNum + 1;

    if (SVP_YOLO_REPORT_BLOB_NUM != pstNnieParam->stModel.astSeg[0].u16DstNum)
    {
        printf("[FAIL] pstNnieParam->stModel.astSeg[0].u16DstNum should be euqal to %d\n", SVP_YOLO_REPORT_BLOB_NUM);
        return HI_FAILURE;
    }

    HI_U64 u64Tmp = 0;
    HI_U64 u64MaxBlobSize = 0;
    HI_U64 u64DstBlobSize = 0;
    HI_U64 u64TotalBboxNum = 0;
    for (HI_S32 i = 0; i < pstNnieParam->stModel.astSeg[0].u16DstNum; i++)
    {
        u64DstBlobSize = pstNnieParam->stModel.astSeg[0].astDstNode[i].unShape.stWhc.u32Width * sizeof(HI_U32);
        if (u64DstBlobSize > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64DstBlobSize should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }

        u64DstBlobSize *= pstNnieParam->stModel.astSeg[0].astDstNode[i].unShape.stWhc.u32Height;
        if (u64DstBlobSize > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64DstBlobSize should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }

        u64DstBlobSize *= pstNnieParam->stModel.astSeg[0].astDstNode[i].unShape.stWhc.u32Chn;
        if (u64DstBlobSize > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64DstBlobSize should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }

        if (u64MaxBlobSize < u64DstBlobSize)
        {
            u64MaxBlobSize = u64DstBlobSize;
        }

        u64Tmp = (HI_U64)pstYoloParam->au32GridNumWidth[i] * pstYoloParam->au32GridNumHeight[i];
        if (u64Tmp > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64Tmp should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }

        u64Tmp *= pstYoloParam->u32BboxNumEachGrid;
        if (u64Tmp > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64Tmp should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }

        u64TotalBboxNum += u64Tmp;
        if (u64TotalBboxNum > SVP_NNIE_MAX_MEM)
        {
            printf("[FAIL] u64TotalBboxNum should be less than %x\n", SVP_NNIE_MAX_MEM);
            return HI_FAILURE;
        }
    }

    HI_U64 u64AssistStackSize = u64TotalBboxNum * sizeof(SVP_NNIE_STACK_S);
    if (u64AssistStackSize > SVP_NNIE_MAX_MEM)
    {
        printf("[FAIL] u64AssistStackSize should be less than %x\n", SVP_NNIE_MAX_MEM);
        return HI_FAILURE;
    }

    HI_U64 u64TotalBboxSize = u64TotalBboxNum * sizeof(SVP_YOLO_BBOX_S);
    if (u64TotalBboxSize > SVP_NNIE_MAX_MEM)
    {
        printf("[FAIL] u64TotalBboxSize should be less than %x\n", SVP_NNIE_MAX_MEM);
        return HI_FAILURE;
    }

    HI_U64 u64TotalSize = u64MaxBlobSize + u64AssistStackSize + u64TotalBboxSize;
    if (u64TotalSize > SVP_NNIE_MAX_MEM)
    {
        printf("[FAIL] u64TotalSize should be less than %x\n", SVP_NNIE_MAX_MEM);
        return HI_FAILURE;
    }

    HI_U32 u32TmpBufTotalSize = (HI_U32)u64TotalSize;

    HI_U32 u32DstRoiSize = ALIGN(u32ClassNum * pstYoloParam->u32MaxRoiNum * sizeof(HI_U32) * SVP_NNIE_COORDI_NUM);
    HI_U32 u32DstScoreSize = ALIGN(u32ClassNum * pstYoloParam->u32MaxRoiNum * sizeof(HI_U32));
    HI_U32 u32ClassRoiNumSize = ALIGN(u32ClassNum * sizeof(HI_U32));
    HI_U32 u32TotalSize = u32DstRoiSize + u32DstScoreSize + u32ClassRoiNumSize + u32TmpBufTotalSize;

    HI_U64 u64PhyAddr = 0;
    HI_U8 *pu8VirAddr = HI_NULL;
    s32Ret = HI_MPI_SYS_MmzAlloc((HI_U64 *)&u64PhyAddr, (HI_VOID **)&pu8VirAddr, "YOLO_INIT", HI_NULL, u32TotalSize);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_SYS_MmzAlloc: %x\n", s32Ret);
        return HI_FAILURE;
    }
    memset_s(pu8VirAddr, u32TotalSize, 0, u32TotalSize);
    HI_MPI_SYS_MmzFlushCache(u64PhyAddr, (HI_VOID *)pu8VirAddr, u32TotalSize);

    pstYoloParam->stGetResultTmpBuf.u64PhyAddr = u64PhyAddr;
    pstYoloParam->stGetResultTmpBuf.u64VirAddr = (HI_U64)(HI_UINTPTR_T)pu8VirAddr;

    pstYoloParam->stDstRoi.enType = SVP_BLOB_TYPE_S32;
    pstYoloParam->stDstRoi.u64PhyAddr = u64PhyAddr + u32TmpBufTotalSize;
    pstYoloParam->stDstRoi.u64VirAddr = (HI_U64)(HI_UINTPTR_T)(pu8VirAddr + u32TmpBufTotalSize);
    pstYoloParam->stDstRoi.u32Stride = ALIGN(u32ClassNum * pstYoloParam->u32MaxRoiNum * sizeof(HI_U32) * SVP_NNIE_COORDI_NUM);
    pstYoloParam->stDstRoi.u32Num = 1;
    pstYoloParam->stDstRoi.unShape.stWhc.u32Chn = 1;
    pstYoloParam->stDstRoi.unShape.stWhc.u32Height = 1;
    pstYoloParam->stDstRoi.unShape.stWhc.u32Width = u32ClassNum * pstYoloParam->u32MaxRoiNum * SVP_NNIE_COORDI_NUM;

    pstYoloParam->stDstScore.enType = SVP_BLOB_TYPE_S32;
    pstYoloParam->stDstScore.u64PhyAddr = u64PhyAddr + u32TmpBufTotalSize + u32DstRoiSize;
    pstYoloParam->stDstScore.u64VirAddr = (HI_U64)(HI_UINTPTR_T)(pu8VirAddr + u32TmpBufTotalSize + u32DstRoiSize);
    pstYoloParam->stDstScore.u32Stride = ALIGN(u32ClassNum * pstYoloParam->u32MaxRoiNum * sizeof(HI_U32));
    pstYoloParam->stDstScore.u32Num = 1;
    pstYoloParam->stDstScore.unShape.stWhc.u32Chn = 1;
    pstYoloParam->stDstScore.unShape.stWhc.u32Height = 1;
    pstYoloParam->stDstScore.unShape.stWhc.u32Width = u32ClassNum * pstYoloParam->u32MaxRoiNum;

    pstYoloParam->stClassRoiNum.enType = SVP_BLOB_TYPE_S32;
    pstYoloParam->stClassRoiNum.u64PhyAddr = u64PhyAddr + u32TmpBufTotalSize + u32DstRoiSize + u32DstScoreSize;
    pstYoloParam->stClassRoiNum.u64VirAddr = (HI_U64)(HI_UINTPTR_T)(pu8VirAddr + u32TmpBufTotalSize + u32DstRoiSize + u32DstScoreSize);
    pstYoloParam->stClassRoiNum.u32Stride = ALIGN(u32ClassNum * sizeof(HI_U32));
    pstYoloParam->stClassRoiNum.u32Num = 1;
    pstYoloParam->stClassRoiNum.unShape.stWhc.u32Chn = 1;
    pstYoloParam->stClassRoiNum.unShape.stWhc.u32Height = 1;
    pstYoloParam->stClassRoiNum.unShape.stWhc.u32Width = u32ClassNum;

    return HI_SUCCESS;
}

/*
 * Release the MMZ buffer recorded in pstYoloParam->stGetResultTmpBuf and
 * clear every address stored in the YOLO parameter block.
 *
 * The buffer is only freed when both its physical and virtual address are
 * non-zero; otherwise the structure is left untouched. A NULL pstYoloParam
 * is reported and ignored.
 */
HI_VOID SVP_NNIE_Yolo_DeInit(SVP_YOLO_PARAM_S *pstYoloParam)
{
    if (HI_NULL == pstYoloParam)
    {
        printf("[FAIL] pstYoloParam is NULL\n");
        return;
    }

    HI_U64 u64BufPhy = pstYoloParam->stGetResultTmpBuf.u64PhyAddr;
    HI_U64 u64BufVir = pstYoloParam->stGetResultTmpBuf.u64VirAddr;
    if (0 == u64BufPhy || 0 == u64BufVir)
    {
        /* Nothing was allocated (or it has already been released). */
        return;
    }

    HI_MPI_SYS_MmzFree(u64BufPhy, (HI_VOID *)(HI_UINTPTR_T)u64BufVir);

    /* The three result blobs are cleared together with the temp buffer. */
    pstYoloParam->stGetResultTmpBuf.u64PhyAddr = 0;
    pstYoloParam->stGetResultTmpBuf.u64VirAddr = 0;

    pstYoloParam->stDstRoi.u64PhyAddr = 0;
    pstYoloParam->stDstRoi.u64VirAddr = 0;

    pstYoloParam->stDstScore.u64PhyAddr = 0;
    pstYoloParam->stDstScore.u64VirAddr = 0;

    pstYoloParam->stClassRoiNum.u64PhyAddr = 0;
    pstYoloParam->stClassRoiNum.u64VirAddr = 0;
}

/*
 * Replace each of the first u32Num values at pf32Src with its logistic
 * sigmoid, 1 / (1 + exp(-x)), as computed by SVP_NNIE_SIGMOID.
 *
 * pf32Src  values to transform; overwritten in place
 * u32Num   number of values to transform (0 leaves the array untouched)
 */
inline HI_VOID SVP_NNIE_Sigmoid_Inplace(HI_FLOAT *pf32Src, HI_U32 u32Num)
{
    HI_FLOAT *pf32Cur = pf32Src;
    HI_U32 u32Remaining = u32Num;

    while (u32Remaining > 0)
    {
        *pf32Cur = SVP_NNIE_SIGMOID(*pf32Cur);
        pf32Cur++;
        u32Remaining--;
    }
}

/*
 * Find the largest of the first u32Num values in pf32Val.
 *
 * pf32Val            values to scan; element 0 is always read, so the array
 *                    must hold at least one value even when u32Num is 0
 * u32Num             number of values to consider
 * pu32MaxValueIndex  receives the index of the maximum; on ties the lowest
 *                    index wins because only a strictly greater value
 *                    replaces the current best
 *
 * Returns the maximum value found.
 */
inline HI_FLOAT SVP_NNIE_GetMaxVal(HI_FLOAT *pf32Val, HI_U32 u32Num, HI_U32 *pu32MaxValueIndex)
{
    HI_U32 u32BestIdx = 0;
    HI_FLOAT f32Best = pf32Val[0];

    for (HI_U32 u32Idx = 1; u32Idx < u32Num; u32Idx++)
    {
        if (pf32Val[u32Idx] > f32Best)
        {
            u32BestIdx = u32Idx;
            f32Best = pf32Val[u32Idx];
        }
    }

    *pu32MaxValueIndex = u32BestIdx;
    return f32Best;
}

/*
 * Intersection-over-union of two bounding boxes.
 *
 * Returns 0 when the boxes do not overlap (overlap width or height <= 0),
 * otherwise intersection area / (area1 + area2 - intersection area).
 * The individual areas are computed in single precision and only then
 * widened to double.
 */
inline HI_DOUBLE SVP_Yolo_Iou(SVP_YOLO_BBOX_S *pstBbox1, SVP_YOLO_BBOX_S *pstBbox2)
{
    /* Edges of the overlap rectangle. */
    HI_FLOAT f32LoX = SVP_NNIE_MAX(pstBbox1->f32Xmin, pstBbox2->f32Xmin);
    HI_FLOAT f32HiX = SVP_NNIE_MIN(pstBbox1->f32Xmax, pstBbox2->f32Xmax);
    HI_FLOAT f32LoY = SVP_NNIE_MAX(pstBbox1->f32Ymin, pstBbox2->f32Ymin);
    HI_FLOAT f32HiY = SVP_NNIE_MIN(pstBbox1->f32Ymax, pstBbox2->f32Ymax);

    HI_FLOAT f32OverlapW = f32HiX - f32LoX;
    HI_FLOAT f32OverlapH = f32HiY - f32LoY;
    if (f32OverlapW <= 0 || f32OverlapH <= 0)
    {
        return 0;
    }

    HI_DOUBLE f64Overlap = f32OverlapW * f32OverlapH;
    HI_DOUBLE f64AreaA = (pstBbox1->f32Xmax - pstBbox1->f32Xmin) * (pstBbox1->f32Ymax - pstBbox1->f32Ymin);
    HI_DOUBLE f64AreaB = (pstBbox2->f32Xmax - pstBbox2->f32Xmin) * (pstBbox2->f32Ymax - pstBbox2->f32Ymin);

    return f64Overlap / (f64AreaA + f64AreaB - f64Overlap);
}

/*
 * Greedy non-maximum suppression over pstBbox[0 .. u32BboxNum).
 *
 * Boxes are visited in array order. Each box whose u32Mask is still 0 is
 * kept, and every later unmasked box whose IoU with it is at least
 * u32NmsThresh / SVP_NNIE_QUANT_BASE gets u32Mask set to 1. The class index
 * is not consulted. Scanning stops once u32MaxRoiNum boxes have been kept;
 * boxes after that point are left as they are.
 *
 * The caller in this file (SVP_YOLO_PostProcess) sorts the boxes by score
 * before calling, so earlier boxes take precedence over later ones.
 */
inline HI_VOID SVP_Yolo_NonMaxSuppression(SVP_YOLO_BBOX_S *pstBbox, HI_U32 u32BboxNum, HI_U32 u32NmsThresh, HI_U32 u32MaxRoiNum)
{
    const HI_DOUBLE f64IouLimit = (HI_DOUBLE)u32NmsThresh / SVP_NNIE_QUANT_BASE;
    HI_U32 u32Kept = 0;

    for (HI_U32 u32Ref = 0; u32Ref < u32BboxNum && u32Kept < u32MaxRoiNum; u32Ref++)
    {
        if (pstBbox[u32Ref].u32Mask != 0)
        {
            continue;
        }
        u32Kept++;

        for (HI_U32 u32Cand = u32Ref + 1; u32Cand < u32BboxNum; u32Cand++)
        {
            if (pstBbox[u32Cand].u32Mask != 0)
            {
                continue;
            }
            if (SVP_Yolo_Iou(&pstBbox[u32Ref], &pstBbox[u32Cand]) >= f64IouLimit)
            {
                pstBbox[u32Cand].u32Mask = 1;
            }
        }
    }
}

/*
 * Exchange two records of u32ArraySize 32-bit words each, word by word.
 *
 * ps32Src1, ps32Src2  start of the two records
 * u32ArraySize        number of HI_S32 words per record
 */
inline HI_VOID SVP_Yolo_Argswap(HI_S32 *ps32Src1, HI_S32 *ps32Src2, HI_U32 u32ArraySize)
{
    for (HI_U32 u32Word = 0; u32Word < u32ArraySize; u32Word++)
    {
        const HI_S32 s32Held = ps32Src2[u32Word];
        ps32Src2[u32Word] = ps32Src1[u32Word];
        ps32Src1[u32Word] = s32Held;
    }
}

/*
 * Iterative quicksort of fixed-size records, ordering them by one 32-bit
 * key word per record from largest to smallest.
 *
 * ps32Array     records stored back to back, each u32ArraySize HI_S32 words
 * s32Low        index of the first record to sort (inclusive)
 * s32High       index of the last record to sort (inclusive)
 * u32ArraySize  number of HI_S32 words per record
 * u32ScoreIdx   position of the key word inside a record
 * pstStack      scratch array used as an explicit stack of pending
 *               [min, max] ranges instead of recursion; no capacity check
 *               is made here, so the caller must supply enough entries
 */
inline HI_VOID SVP_Yolo_NonRecursiveArgQuickSort(HI_S32 *ps32Array, HI_S32 s32Low, HI_S32 s32High, HI_U32 u32ArraySize, HI_U32 u32ScoreIdx, SVP_NNIE_STACK_S *pstStack)
{
    HI_S32 i = s32Low;
    HI_S32 j = s32High;
    HI_S32 s32Top = 0;
    /* Initial value only; the key is re-read for every range popped below. */
    HI_S32 s32KeyConfidence = ps32Array[u32ArraySize * s32Low + u32ScoreIdx];
    /* Seed the stack with the full range. */
    pstStack[s32Top].s32Min = s32Low;
    pstStack[s32Top].s32Max = s32High;

    while (s32Top > -1)
    {
        /* Pop the next pending range. */
        s32Low = pstStack[s32Top].s32Min;
        s32High = pstStack[s32Top].s32Max;
        i = s32Low;
        j = s32High;
        s32Top--;

        /* The pivot is the key of the first record of the range. */
        s32KeyConfidence = ps32Array[u32ArraySize * s32Low + u32ScoreIdx];

        /*
         * Partition: the pivot record alternates between positions i and j
         * as it is swapped, and ends up at i when the two indices meet.
         */
        while (i < j)
        {
            /* Skip records on the right whose key is already below the pivot. */
            while (i < j && s32KeyConfidence > ps32Array[j * u32ArraySize + u32ScoreIdx])
            {
                j--;
            }
            if (i < j)
            {
                /* Record j has key >= pivot: move it to the left side. */
                SVP_Yolo_Argswap(&ps32Array[i * u32ArraySize], &ps32Array[j * u32ArraySize], u32ArraySize);
                i++;
            }

            /* Skip records on the left whose key is already above the pivot. */
            while (i < j && s32KeyConfidence < ps32Array[i * u32ArraySize + u32ScoreIdx])
            {
                i++;
            }
            if (i < j)
            {
                /* Record i has key <= pivot: move it to the right side. */
                SVP_Yolo_Argswap(&ps32Array[i * u32ArraySize], &ps32Array[j * u32ArraySize], u32ArraySize);
                j--;
            }
        }

        /* Push the left sub-range if it holds more than one record. */
        if (s32Low < i - 1)
        {
            s32Top++;
            pstStack[s32Top].s32Min = s32Low;
            pstStack[s32Top].s32Max = i - 1;
        }

        /* Push the right sub-range if it holds more than one record. */
        if (s32High > i + 1)
        {
            s32Top++;
            pstStack[s32Top].s32Min = i + 1;
            pstStack[s32Top].s32Max = s32High;
        }
    }
}

/*
 * Print the detections stored in the three result blobs to stdout.
 *
 * pstDstScore     scores as HI_S32 fixed-point values (divided by
 *                 SVP_NNIE_QUANT_BASE before comparison), packed class
 *                 after class
 * pstDstRoi       boxes as four HI_S32 values each (xmin, ymin, xmax, ymax),
 *                 packed in the same order as the scores
 * pstClassRoiNum  number of boxes per class; its stWhc.u32Width is the
 *                 number of classes
 * f32Threshold    minimum score for a box to be printed
 *
 * Class 0 is never printed; its box count is only used as the starting
 * offset for class 1. Within a class, printing stops at the first box whose
 * score is below the threshold.
 */
HI_VOID SVP_NNIE_Detection_PrintResult(SVP_BLOB_S *pstDstScore, SVP_BLOB_S *pstDstRoi, SVP_BLOB_S *pstClassRoiNum, HI_FLOAT f32Threshold)
{
    HI_S32 *ps32Score = (HI_S32 *)(HI_UINTPTR_T)pstDstScore->u64VirAddr;
    HI_S32 *ps32Roi = (HI_S32 *)(HI_UINTPTR_T)pstDstRoi->u64VirAddr;
    HI_S32 *ps32ClassRoiNum = (HI_S32 *)(HI_UINTPTR_T)pstClassRoiNum->u64VirAddr;
    HI_U32 u32ClassNum = pstClassRoiNum->unShape.stWhc.u32Width;

    /* Number of boxes that precede the current class in the packed arrays. */
    HI_U32 u32RoiNumBias = ps32ClassRoiNum[0];
    for (HI_S32 i = 1; (HI_U32)i < u32ClassNum; i++)
    {
        HI_U32 u32RoiNum = (HI_U32)ps32ClassRoiNum[i];
        HI_U32 u32ScoreBias = u32RoiNumBias;
        HI_U32 u32BboxBias = u32RoiNumBias * SVP_NNIE_COORDI_NUM;

        /*
         * Test the box count first: for an empty class the slot at
         * u32ScoreBias belongs to another class, or lies past the last
         * valid score, and must not be read.
         */
        if (u32RoiNum != 0 && (HI_FLOAT)ps32Score[u32ScoreBias] / SVP_NNIE_QUANT_BASE >= f32Threshold)
        {
            printf("[INFO] the %d'th class bbox:\n", i);
        }

        for (HI_U32 j = 0; j < u32RoiNum; j++)
        {
            HI_FLOAT f32Score = (HI_FLOAT)ps32Score[u32ScoreBias + j] / SVP_NNIE_QUANT_BASE;
            if (f32Score < f32Threshold)
            {
                break;
            }

            const HI_S32 *ps32Box = &ps32Roi[u32BboxBias + j * SVP_NNIE_COORDI_NUM];
            HI_S32 s32XMin = ps32Box[SVP_NNIE_X_MIN_OFFSET];
            HI_S32 s32YMin = ps32Box[SVP_NNIE_Y_MIN_OFFSET];
            HI_S32 s32XMax = ps32Box[SVP_NNIE_X_MAX_OFFSET];
            HI_S32 s32YMax = ps32Box[SVP_NNIE_Y_MAX_OFFSET];
            printf(" - xmin: %d, ymin: %d, xmax: %d, y_max: %d, score: %f.\n", s32XMin, s32YMin, s32XMax, s32YMax, f32Score);
        }
        u32RoiNumBias += u32RoiNum;
    }
}

/*
 * Turn the SVP_YOLO_REPORT_BLOB_NUM output blobs of segment 0 into
 * per-class detection results.
 *
 * Steps:
 *   1. For each blob, de-quantize (divide by SVP_NNIE_QUANT_BASE) and
 *      permute the data so the SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM values
 *      of every candidate box are contiguous.
 *   2. Decode every candidate (centre, size, objectness * best class score)
 *      and keep those whose score exceeds pstYoloParam->u32ConfThresh.
 *   3. Sort the kept boxes by score (highest first) and run non-maximum
 *      suppression across all of them; the suppression does not take the
 *      class into account.
 *   4. Write the surviving boxes class by class (class indices start at 1)
 *      into stDstRoi / stDstScore, at most u32MaxRoiNum per class, and the
 *      per-class counts into stClassRoiNum. Entry 0 of stClassRoiNum is not
 *      written here.
 *
 * pstNnieParam  NNIE parameters holding the network output blobs
 * pstYoloParam  YOLO configuration, scratch buffer and result blobs
 * bVerbose      when true, print the results with
 *               SVP_NNIE_Detection_PrintResult
 *
 * Returns HI_SUCCESS, or HI_FAILURE when an argument is NULL, the original
 * image size is zero, u32ClassNum does not fit the per-box record, or the
 * scratch buffer has not been allocated. All of these are checked before
 * any data is touched.
 */
HI_S32 SVP_YOLO_PostProcess(SVP_NNIE_PARAM_S *pstNnieParam, SVP_YOLO_PARAM_S *pstYoloParam, HI_BOOL bVerbose)
{
    if (pstNnieParam == HI_NULL)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return HI_FAILURE;
    }

    if (pstYoloParam == HI_NULL)
    {
        printf("[FAIL] pstYoloParam is NULL\n");
        return HI_FAILURE;
    }

    /* Both values are used as divisors for every candidate box. */
    if (pstYoloParam->u32OriImWidth == 0 || pstYoloParam->u32OriImHeight == 0)
    {
        printf("[FAIL] Divisor pstYoloParam->u32OriImWidth or pstYoloParam->u32OriImHeight cannot be 0!\n");
        return HI_FAILURE;
    }

    /*
     * Each box record holds 4 coordinates, 1 objectness value and the class
     * scores. More classes than fit would make the in-place sigmoid below
     * run into the next record or past the end of the buffer.
     */
    if (pstYoloParam->u32ClassNum > SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM - SVP_NNIE_SUPPRESS_FLAG_OFFSET)
    {
        printf("[FAIL] pstYoloParam->u32ClassNum should not be greater than %d\n",
               SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM - SVP_NNIE_SUPPRESS_FLAG_OFFSET);
        return HI_FAILURE;
    }

    if (pstYoloParam->stGetResultTmpBuf.u64VirAddr == 0)
    {
        printf("[FAIL] pstYoloParam->stGetResultTmpBuf is not allocated\n");
        return HI_FAILURE;
    }

    const HI_U32 u32BboxNumEachGrid = pstYoloParam->u32BboxNumEachGrid;
    const HI_U32 u32ValuesPerGrid = SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM * u32BboxNumEachGrid;

    /* Largest blob (in 32-bit values) and total number of candidate boxes. */
    HI_U32 u32MaxBlobValueNum = 0;
    HI_U32 u32TotalBboxNum = 0;
    for (HI_U32 i = 0; i < SVP_YOLO_REPORT_BLOB_NUM; i++)
    {
        HI_U32 u32GridNum = pstYoloParam->au32GridNumWidth[i] * pstYoloParam->au32GridNumHeight[i];
        HI_U32 u32BlobValueNum = u32GridNum * u32ValuesPerGrid;
        if (u32MaxBlobValueNum < u32BlobValueNum)
        {
            u32MaxBlobValueNum = u32BlobValueNum;
        }
        u32TotalBboxNum += u32GridNum * u32BboxNumEachGrid;
    }

    /* Scratch layout: [permuted blob][candidate boxes][quicksort stack]. */
    HI_FLOAT *pf32Permute = (HI_FLOAT *)(HI_UINTPTR_T)pstYoloParam->stGetResultTmpBuf.u64VirAddr;
    SVP_YOLO_BBOX_S *pstBbox = (SVP_YOLO_BBOX_S *)(pf32Permute + u32MaxBlobValueNum);
    SVP_NNIE_STACK_S *pstStack = (SVP_NNIE_STACK_S *)(pstBbox + u32TotalBboxNum);

    HI_U32 u32BboxNum = 0;
    for (HI_U32 i = 0; i < SVP_YOLO_REPORT_BLOB_NUM; i++)
    {
        const HI_U32 u32GridW = pstYoloParam->au32GridNumWidth[i];
        const HI_U32 u32GridH = pstYoloParam->au32GridNumHeight[i];
        const HI_S32 *ps32InputBlob = (HI_S32 *)(HI_UINTPTR_T)pstNnieParam->astSegData[0].astDst[i].u64VirAddr;
        /* Distances, in 32-bit values, between channels and between rows. */
        const HI_U32 u32ChnOffset = u32GridH * pstNnieParam->astSegData[0].astDst[i].u32Stride / sizeof(HI_U32);
        const HI_U32 u32HeightOffset = pstNnieParam->astSegData[0].astDst[i].u32Stride / sizeof(HI_U32);

        /* Permute from channel-major to cell-major order and de-quantize. */
        HI_U32 u32PermuteIdx = 0;
        for (HI_U32 h = 0; h < u32GridH; h++)
        {
            for (HI_U32 w = 0; w < u32GridW; w++)
            {
                for (HI_U32 c = 0; c < u32ValuesPerGrid; c++)
                {
                    pf32Permute[u32PermuteIdx++] = (HI_FLOAT)(ps32InputBlob[c * u32ChnOffset + h * u32HeightOffset + w]) / SVP_NNIE_QUANT_BASE;
                }
            }
        }

        /* Decode every candidate box of every grid cell. */
        for (HI_U32 j = 0; j < u32GridW * u32GridH; j++)
        {
            HI_U32 u32GridXIdx = j % u32GridW;
            HI_U32 u32GridYIdx = j / u32GridW;
            for (HI_U32 k = 0; k < u32BboxNumEachGrid; k++)
            {
                HI_U32 u32MaxValueIndex = 0;
                HI_U32 u32BoxOffset = (j * u32BboxNumEachGrid + k) * SVP_YOLO_EACH_BBOX_INFER_RESULT_NUM;

                /* Box centre, normalized to [0, 1] by the grid size. */
                HI_FLOAT f32StartX = ((HI_FLOAT)u32GridXIdx + SVP_NNIE_SIGMOID(pf32Permute[u32BoxOffset + 0])) / u32GridW;
                HI_FLOAT f32StartY = ((HI_FLOAT)u32GridYIdx + SVP_NNIE_SIGMOID(pf32Permute[u32BoxOffset + 1])) / u32GridH;

                /* Box size: exp(raw) * anchor bias, divided by the image size. */
                HI_FLOAT f32Width = (HI_FLOAT)(exp(pf32Permute[u32BoxOffset + SVP_NNIE_X_MAX_OFFSET]) *
                                               pstYoloParam->af32Bias[i][2 * k]) /
                                    pstYoloParam->u32OriImWidth;
                HI_FLOAT f32Height = (HI_FLOAT)(exp(pf32Permute[u32BoxOffset + SVP_NNIE_Y_MAX_OFFSET]) *
                                                pstYoloParam->af32Bias[i][2 * k + 1]) /
                                     pstYoloParam->u32OriImHeight;

                /* Objectness is value 4; the class scores follow from value 5. */
                SVP_NNIE_Sigmoid_Inplace(&pf32Permute[u32BoxOffset + SVP_NNIE_SCORE_OFFSET], (pstYoloParam->u32ClassNum + 1));
                HI_FLOAT f32ObjScore = pf32Permute[u32BoxOffset + SVP_NNIE_SCORE_OFFSET];
                HI_FLOAT f32MaxScore = SVP_NNIE_GetMaxVal(&pf32Permute[u32BoxOffset + SVP_NNIE_SUPPRESS_FLAG_OFFSET],
                                                          pstYoloParam->u32ClassNum, &u32MaxValueIndex);
                HI_S32 s32ClassScore = (HI_S32)(f32MaxScore * f32ObjScore * SVP_NNIE_QUANT_BASE);

                if ((HI_U32)s32ClassScore > pstYoloParam->u32ConfThresh)
                {
                    pstBbox[u32BboxNum].f32Xmin = (HI_FLOAT)(f32StartX - f32Width * 0.5f);
                    pstBbox[u32BboxNum].f32Ymin = (HI_FLOAT)(f32StartY - f32Height * 0.5f);
                    pstBbox[u32BboxNum].f32Xmax = (HI_FLOAT)(f32StartX + f32Width * 0.5f);
                    pstBbox[u32BboxNum].f32Ymax = (HI_FLOAT)(f32StartY + f32Height * 0.5f);
                    pstBbox[u32BboxNum].s32ClsScore = s32ClassScore;
                    pstBbox[u32BboxNum].u32Mask = 0;
                    /* Reported class indices are 1-based. */
                    pstBbox[u32BboxNum].u32ClassIdx = u32MaxValueIndex + 1;
                    u32BboxNum++;
                }
            }
        }
    }

    if (u32BboxNum >= 1)
    {
        /* Key word 4 of SVP_YOLO_BBOX_S is s32ClsScore. */
        SVP_Yolo_NonRecursiveArgQuickSort((HI_S32 *)pstBbox, 0, u32BboxNum - 1, sizeof(SVP_YOLO_BBOX_S) / sizeof(HI_U32), 4, pstStack);
    }
    SVP_Yolo_NonMaxSuppression(pstBbox, u32BboxNum, pstYoloParam->u32NmsThresh, u32BboxNum);

    /* Emit the surviving boxes grouped by class, in pixel coordinates. */
    HI_S32 *ps32DstRoi = (HI_S32 *)(HI_UINTPTR_T)pstYoloParam->stDstRoi.u64VirAddr;
    HI_S32 *ps32ClassRoiNum = (HI_S32 *)(HI_UINTPTR_T)pstYoloParam->stClassRoiNum.u64VirAddr;
    HI_S32 *ps32DstScore = (HI_S32 *)(HI_UINTPTR_T)pstYoloParam->stDstScore.u64VirAddr;
    for (HI_U32 i = 1; i < pstYoloParam->u32ClassNum + 1; i++)
    {
        HI_U32 u32ClassRoiNum = 0;
        for (HI_U32 j = 0; j < u32BboxNum; j++)
        {
            if (pstBbox[j].u32Mask == 0 && i == pstBbox[j].u32ClassIdx && u32ClassRoiNum < pstYoloParam->u32MaxRoiNum)
            {
                /* Clamp to the image: lower bound 0, upper bound width/height. */
                *(ps32DstRoi++) = SVP_NNIE_MAX((HI_S32)(pstBbox[j].f32Xmin * pstYoloParam->u32OriImWidth), 0);
                *(ps32DstRoi++) = SVP_NNIE_MAX((HI_S32)(pstBbox[j].f32Ymin * pstYoloParam->u32OriImHeight), 0);
                *(ps32DstRoi++) = SVP_NNIE_MIN((HI_S32)(pstBbox[j].f32Xmax * pstYoloParam->u32OriImWidth), (HI_S32)pstYoloParam->u32OriImWidth);
                *(ps32DstRoi++) = SVP_NNIE_MIN((HI_S32)(pstBbox[j].f32Ymax * pstYoloParam->u32OriImHeight), (HI_S32)pstYoloParam->u32OriImHeight);
                *(ps32DstScore++) = pstBbox[j].s32ClsScore;
                u32ClassRoiNum++;
            }
        }
        *(ps32ClassRoiNum + i) = u32ClassRoiNum;
    }

    if (bVerbose)
    {
        SVP_NNIE_Detection_PrintResult(&pstYoloParam->stDstScore, &pstYoloParam->stDstRoi, &pstYoloParam->stClassRoiNum, SVP_YOLO_VERBOSE_THRESHOLD);
    }

    return HI_SUCCESS;
}

/*
 * Rearrange the 48-channel fixed-point network output into a 3-plane 8-bit
 * image (pixel shuffle with factor SVP_SR_UPSCALE_FACTOR) and convert that
 * image to YUV with the IVE colour-space converter.
 *
 * pstNnieParam  NNIE parameters; astDst[0] of segment 0 must have exactly
 *               48 channels, astSrc[0] provides the low-resolution size
 * pstSrcImg     planar 8-bit image that receives the shuffled pixels and is
 *               then used as the CSC input
 * pstDstImg     CSC output image
 *
 * Channel layout used: 16 consecutive channels per output plane; within a
 * plane, channels 4*k .. 4*k+3 supply output row k of every group of four
 * rows and are interleaved horizontally.
 *
 * Returns HI_SUCCESS, or HI_FAILURE when the channel count is wrong or an
 * IVE call fails.
 *
 * NOTE(review): the inner loop consumes 16 values per channel per iteration
 * and relies on that to step from one source row to the next, so it looks
 * like the low-resolution width has to be a multiple of 16 - confirm.
 */
inline HI_S32 SVP_SR_PixelShuffle(SVP_NNIE_PARAM_S *pstNnieParam, IVE_IMAGE_S *pstSrcImg, IVE_IMAGE_S *pstDstImg)
{
    enum
    {
        SR_PLANE_NUM = 3,                                              /* output colour planes */
        SR_CHN_PER_ROW = 4,                                            /* channels interleaved by one vst4 */
        SR_CHN_PER_PLANE = SR_CHN_PER_ROW * SVP_SR_UPSCALE_FACTOR,     /* 16 */
        SR_CHN_NUM = SR_PLANE_NUM * SR_CHN_PER_PLANE                   /* 48 */
    };

    HI_S32 s32Ret;

    HI_U32 u32Stride = pstSrcImg->au32Stride[0];
    HI_U8 *pu8Addr = (HI_U8 *)(HI_UINTPTR_T)pstSrcImg->au64VirAddr[0];

    if (pstNnieParam->astSegData[0].astDst[0].unShape.stWhc.u32Chn != SR_CHN_NUM)
    {
        printf("[FAIL] pstNnieParam->astSegData[0].astDst[0].unShape.stWhc.u32Chn should be %d\n", SR_CHN_NUM);
        return HI_FAILURE;
    }

    /* Read cursor for every input channel; each advances as it is consumed. */
    HI_U32 u32InChnSize = ALIGN(pstNnieParam->astSegData[0].astDst[0].unShape.stWhc.u32Width) * pstNnieParam->astSegData[0].astDst[0].unShape.stWhc.u32Height;
    HI_S32 *aps32InChnAddr[SR_CHN_NUM] = {0};
    aps32InChnAddr[0] = (HI_S32 *)(HI_UINTPTR_T)pstNnieParam->astSegData[0].astDst[0].u64VirAddr;
    for (HI_S32 i = 1; i < SR_CHN_NUM; i++)
    {
        aps32InChnAddr[i] = aps32InChnAddr[i - 1] + u32InChnSize;
    }

    const HI_U32 u32SrcHeight = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Height;
    const HI_U32 u32BlocksPerRow = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Width / (SVP_SR_UPSCALE_FACTOR * SVP_SR_UPSCALE_FACTOR);

    HI_U8 *pu8OutPtr[SVP_SR_UPSCALE_FACTOR] = {0};

    /* The output planes follow each other, so pu8Addr simply keeps advancing. */
    for (HI_U32 u32Plane = 0; u32Plane < SR_PLANE_NUM; u32Plane++)
    {
        for (HI_U32 u32Row = 0; u32Row < u32SrcHeight; u32Row++)
        {
            /* One input row produces SVP_SR_UPSCALE_FACTOR output rows. */
            for (HI_S32 j = 0; j < SVP_SR_UPSCALE_FACTOR; j++)
            {
                pu8OutPtr[j] = pu8Addr;
                pu8Addr += u32Stride;
            }

            for (HI_U32 u32Block = 0; u32Block < u32BlocksPerRow; u32Block++)
            {
                for (HI_S32 k = 0; k < SVP_SR_UPSCALE_FACTOR; k++)
                {
                    /* The four channels that feed output row k of this plane. */
                    HI_S32 **pps32Chn = &aps32InChnAddr[u32Plane * SR_CHN_PER_PLANE + k * SR_CHN_PER_ROW];

                    /*
                     * Per channel: load 16 32-bit values (vldm, with
                     * write-back), shift right by 12 bits while narrowing
                     * to 16 bits (vshrn), then narrow to unsigned 8 bits
                     * with saturation (vqmovun). The two vst4.8 stores
                     * write the four channels interleaved, 64 bytes in
                     * total, and advance the output pointer.
                     */
                    asm volatile
                    (
                        "pld [%[src0], #512] \n"
                        "vldm %[src0]!, {d0-d7} \n"

                        "pld [%[src1], #512] \n"
                        "vldm %[src1]!, {d8-d15} \n"

                        "pld [%[src2], #512] \n"
                        "vldm %[src2]!, {d16-d23} \n"

                        "pld [%[src3], #512] \n"
                        "vldm %[src3]!, {d24-d31} \n"

                        "vshrn.i32 d0, q0, #0xC \n"
                        "vshrn.i32 d1, q1, #0xC \n"
                        "vshrn.i32 d2, q2, #0xC \n"
                        "vshrn.i32 d3, q3, #0xC \n"

                        "vshrn.i32 d4, q4, #0xC \n"
                        "vshrn.i32 d5, q5, #0xC \n"
                        "vshrn.i32 d6, q6, #0xC \n"
                        "vshrn.i32 d7, q7, #0xC \n"

                        "vshrn.i32 d8, q8, #0xC \n"
                        "vshrn.i32 d9, q9, #0xC \n"
                        "vshrn.i32 d10, q10, #0xC \n"
                        "vshrn.i32 d11, q11, #0xC \n"

                        "vshrn.i32 d12, q12, #0xC \n"
                        "vshrn.i32 d13, q13, #0xC \n"
                        "vshrn.i32 d14, q14, #0xC \n"
                        "vshrn.i32 d15, q15, #0xC \n"

                        "vqmovun.s16 d0, q0 \n"
                        "vqmovun.s16 d1, q1 \n"

                        "vqmovun.s16 d2, q2 \n"
                        "vqmovun.s16 d3, q3 \n"

                        "vqmovun.s16 d4, q4 \n"
                        "vqmovun.s16 d5, q5 \n"

                        "vqmovun.s16 d6, q6 \n"
                        "vqmovun.s16 d7, q7 \n"

                        "vst4.8 {d0,d2,d4,d6}, [%[dst]]! \n"
                        "vst4.8 {d1,d3,d5,d7}, [%[dst]]! \n"
                        :
                        [dst] "+r" (pu8OutPtr[k]),
                        [src0] "+r" (pps32Chn[0]),
                        [src1] "+r" (pps32Chn[1]),
                        [src2] "+r" (pps32Chn[2]),
                        [src3] "+r" (pps32Chn[3])
                        :
                        : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                    );
                }
            }
        }
    }

    IVE_HANDLE hIveHandle = 0;
    IVE_CSC_CTRL_S stCSCCtrl = {IVE_CSC_MODE_VIDEO_BT709_RGB2YUV};
    s32Ret = HI_MPI_IVE_CSC(&hIveHandle, pstSrcImg, pstDstImg, &stCSCCtrl, HI_TRUE);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] HI_MPI_IVE_CSC: %x\n", s32Ret);
        return HI_FAILURE;
    }

    /*
     * Wait for the CSC task. A timeout is retried; any other error aborts,
     * because bFinish would otherwise never become true.
     */
    HI_BOOL bFinish = HI_FALSE;
    for (;;)
    {
        usleep(100);
        s32Ret = HI_MPI_IVE_Query(hIveHandle, &bFinish, HI_TRUE);
        if (HI_ERR_IVE_QUERY_TIMEOUT == s32Ret)
        {
            printf("[WARNING] HI_MPI_IVE_Query: HI_ERR_IVE_QUERY_TIMEOUT\n");
            continue;
        }
        if (HI_SUCCESS != s32Ret)
        {
            printf("[FAIL] HI_MPI_IVE_Query: %x\n", s32Ret);
            return HI_FAILURE;
        }
        if (HI_TRUE == bFinish)
        {
            break;
        }
    }

    return HI_SUCCESS;
}

/*
 * Build the up-scaled output frame of the super-resolution network.
 *
 * A temporary planar 3-channel image and the final YUV420SP image are both
 * taken from VB pool SVP_SR_VB_ID and mapped into user space.
 * SVP_SR_PixelShuffle fills the temporary image from the network output and
 * converts it into the final image, which is then described in
 * pstOutFrmInfo.
 *
 * pstNnieParam   NNIE parameters; astSrc[0] of segment 0 gives the input
 *                size, which is multiplied by SVP_SR_UPSCALE_FACTOR
 * pstOutFrmInfo  receives the description of the output frame; it is only
 *                written on success
 *
 * Ownership: the temporary image is always unmapped and released before
 * returning. On success the output block and its mapping stay alive and
 * are referenced by pstOutFrmInfo (pool id, physical and virtual
 * addresses); on failure they are unmapped and released here as well.
 *
 * Returns HI_SUCCESS or HI_FAILURE. A failure to unmap or release the
 * temporary image is only logged.
 */
HI_S32 SVP_SR_PostProcess(SVP_NNIE_PARAM_S *pstNnieParam, VIDEO_FRAME_INFO_S *pstOutFrmInfo)
{
    HI_S32 s32Ret;
    HI_S32 s32Result = HI_FAILURE;
    VB_BLK hSrcBlkHdl = VB_INVALID_HANDLE;
    VB_BLK hDstBlkHdl = VB_INVALID_HANDLE;
    IVE_IMAGE_S stSrcImg = {0};
    IVE_IMAGE_S stDstImg = {0};

    if (pstNnieParam == HI_NULL)
    {
        printf("[FAIL] pstNnieParam is NULL\n");
        return HI_FAILURE;
    }

    if (pstOutFrmInfo == HI_NULL)
    {
        printf("[FAIL] pstOutFrmInfo is NULL\n");
        return HI_FAILURE;
    }

    HI_U32 u32Width = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Width * SVP_SR_UPSCALE_FACTOR;
    HI_U32 u32Height = pstNnieParam->astSegData[0].astSrc[0].unShape.stWhc.u32Height * SVP_SR_UPSCALE_FACTOR;
    HI_U32 u32Stride = ALIGN(u32Width);
    HI_U32 u32ChnSize = u32Stride * u32Height;
    HI_U32 u32SrcSize = u32ChnSize * 3;     /* three full-size planes */
    HI_U32 u32DstSize = u32ChnSize * 3 / 2; /* luma plane + half-size chroma plane */

    /* Temporary planar image written by the pixel shuffle. */
    stSrcImg.u32Height = u32Height;
    stSrcImg.u32Width = u32Width;
    stSrcImg.au32Stride[0] = u32Stride;
    stSrcImg.au32Stride[1] = u32Stride;
    stSrcImg.au32Stride[2] = u32Stride;
    stSrcImg.enType = IVE_IMAGE_TYPE_U8C3_PLANAR;

    hSrcBlkHdl = HI_MPI_VB_GetBlock(SVP_SR_VB_ID, u32SrcSize, NULL);
    if (VB_INVALID_HANDLE == hSrcBlkHdl)
    {
        printf("[FAIL] HI_MPI_VB_GetBlock failed!\n");
        goto CLEANUP;
    }
    stSrcImg.au64PhyAddr[0] = HI_MPI_VB_Handle2PhysAddr(hSrcBlkHdl);
    if (0 == stSrcImg.au64PhyAddr[0])
    {
        printf("[FAIL] HI_MPI_VB_Handle2PhysAddr failed!\n");
        goto CLEANUP;
    }
    stSrcImg.au64VirAddr[0] = (HI_U64)(HI_UINTPTR_T)HI_MPI_SYS_Mmap(stSrcImg.au64PhyAddr[0], u32SrcSize);
    if (0 == stSrcImg.au64VirAddr[0])
    {
        printf("[FAIL] HI_MPI_SYS_Mmap failed!\n");
        goto CLEANUP;
    }
    stSrcImg.au64PhyAddr[1] = stSrcImg.au64PhyAddr[0] + u32ChnSize;
    stSrcImg.au64PhyAddr[2] = stSrcImg.au64PhyAddr[1] + u32ChnSize;
    stSrcImg.au64VirAddr[1] = stSrcImg.au64VirAddr[0] + u32ChnSize;
    stSrcImg.au64VirAddr[2] = stSrcImg.au64VirAddr[1] + u32ChnSize;

    /* Final semi-planar image handed to the caller. */
    stDstImg.u32Height = u32Height;
    stDstImg.u32Width = u32Width;
    stDstImg.au32Stride[0] = u32Stride;
    stDstImg.au32Stride[1] = u32Stride;
    stDstImg.au32Stride[2] = u32Stride;
    stDstImg.enType = IVE_IMAGE_TYPE_YUV420SP;

    hDstBlkHdl = HI_MPI_VB_GetBlock(SVP_SR_VB_ID, u32DstSize, NULL);
    if (VB_INVALID_HANDLE == hDstBlkHdl)
    {
        printf("[FAIL] HI_MPI_VB_GetBlock failed!\n");
        goto CLEANUP;
    }

    stDstImg.au64PhyAddr[0] = HI_MPI_VB_Handle2PhysAddr(hDstBlkHdl);
    if (0 == stDstImg.au64PhyAddr[0])
    {
        printf("[FAIL] HI_MPI_VB_Handle2PhysAddr failed!\n");
        goto CLEANUP;
    }

    /* Map exactly the size of the block that was requested above. */
    stDstImg.au64VirAddr[0] = (HI_U64)(HI_UINTPTR_T)HI_MPI_SYS_Mmap(stDstImg.au64PhyAddr[0], u32DstSize);
    if (0 == stDstImg.au64VirAddr[0])
    {
        printf("[FAIL] HI_MPI_SYS_Mmap failed!\n");
        goto CLEANUP;
    }

    /* Semi-planar layout: planes 1 and 2 share the interleaved chroma data. */
    stDstImg.au64PhyAddr[1] = stDstImg.au64PhyAddr[0] + u32ChnSize;
    stDstImg.au64PhyAddr[2] = stDstImg.au64PhyAddr[1];
    stDstImg.au64VirAddr[1] = stDstImg.au64VirAddr[0] + u32ChnSize;
    stDstImg.au64VirAddr[2] = stDstImg.au64VirAddr[1];

    s32Ret = SVP_SR_PixelShuffle(pstNnieParam, &stSrcImg, &stDstImg);
    if (HI_SUCCESS != s32Ret)
    {
        printf("[FAIL] SVP_SR_PixelShuffle failed!\n");
        goto CLEANUP;
    }

    /* Describe the finished frame only once its content is valid. */
    memset_s(pstOutFrmInfo, sizeof(VIDEO_FRAME_INFO_S), 0, sizeof(VIDEO_FRAME_INFO_S));
    pstOutFrmInfo->enModId = HI_ID_SVP_NNIE;
    pstOutFrmInfo->u32PoolId = HI_MPI_VB_Handle2PoolId(hDstBlkHdl);
    pstOutFrmInfo->stVFrame.u32Width = u32Width;
    pstOutFrmInfo->stVFrame.u32Height = u32Height;
    pstOutFrmInfo->stVFrame.enField = VIDEO_FIELD_FRAME;
    pstOutFrmInfo->stVFrame.enPixelFormat = PIXEL_FORMAT_YVU_SEMIPLANAR_420;
    pstOutFrmInfo->stVFrame.enVideoFormat = VIDEO_FORMAT_LINEAR;
    pstOutFrmInfo->stVFrame.enCompressMode = COMPRESS_MODE_NONE;
    pstOutFrmInfo->stVFrame.enDynamicRange = DYNAMIC_RANGE_SDR8;
    pstOutFrmInfo->stVFrame.enColorGamut = COLOR_GAMUT_BT709;
    for (HI_S32 i = 0; i < 3; i++)
    {
        pstOutFrmInfo->stVFrame.u32Stride[i] = stDstImg.au32Stride[i];
        pstOutFrmInfo->stVFrame.u64PhyAddr[i] = stDstImg.au64PhyAddr[i];
        pstOutFrmInfo->stVFrame.u64VirAddr[i] = stDstImg.au64VirAddr[i];
    }

    s32Result = HI_SUCCESS;

CLEANUP:
    /* The temporary image is never needed after this function. */
    if (0 != stSrcImg.au64VirAddr[0])
    {
        s32Ret = HI_MPI_SYS_Munmap((HI_VOID *)(HI_UINTPTR_T)stSrcImg.au64VirAddr[0], u32SrcSize);
        if (HI_SUCCESS != s32Ret)
        {
            printf("[FAIL] HI_MPI_SYS_Munmap failed!\n");
        }
    }
    if (VB_INVALID_HANDLE != hSrcBlkHdl)
    {
        s32Ret = HI_MPI_VB_ReleaseBlock(hSrcBlkHdl);
        if (HI_SUCCESS != s32Ret)
        {
            printf("[FAIL] HI_MPI_VB_ReleaseBlock failed!\n");
        }
    }

    /* The output image is kept for the caller only when everything worked. */
    if (HI_SUCCESS != s32Result)
    {
        if (0 != stDstImg.au64VirAddr[0])
        {
            s32Ret = HI_MPI_SYS_Munmap((HI_VOID *)(HI_UINTPTR_T)stDstImg.au64VirAddr[0], u32DstSize);
            if (HI_SUCCESS != s32Ret)
            {
                printf("[FAIL] HI_MPI_SYS_Munmap failed!\n");
            }
        }
        if (VB_INVALID_HANDLE != hDstBlkHdl)
        {
            s32Ret = HI_MPI_VB_ReleaseBlock(hDstBlkHdl);
            if (HI_SUCCESS != s32Ret)
            {
                printf("[FAIL] HI_MPI_VB_ReleaseBlock failed!\n");
            }
        }
    }

    return s32Result;
}